The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.
User defined parameters
print(params)
## $test_run
## [1] FALSE
##
## $save_figs
## [1] FALSE
##
## $response
## [1] "TotalGramCover_dec"
##
## $hmod
## [1] FALSE
##
## $s
## [1] "TotalGramCover"
##
## $sample_group
## [1] 1
##
## $byRegion
## [1] TRUE
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
hmod <- params$hmod # whether to include human modification in the model
# by changing the sample_group the model can be fit to a completely different set of rows
sample_group <- params$sample_group
response <- params$response
# _ann defines this model as being built on annual data
s <- params$s # string to paste to file names e.g., that defines the interactions in the model
# such as (summer ppt * MAT) and annuals*temperature interactions
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
byRegion <- params$byRegion
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(betareg)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc.
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
Data compiled in the prepDataForModels.R script
modDat <- readRDS("../../../data/DataForModels_withSubEcoreg.rds")
## if it's herbs, remove LDC data right now (has some weirdness that's skewing the data... need to figure that out)
if (s == "HerbCover") {
modDat <- modDat[modDat$Source != "LANDFIRE",]
}
set.seed(1234)
modDat_1 <- modDat %>%
mutate(Lon = st_coordinates(.)[,1],
Lat = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
filter(!is.na(newRegion))
# small dataset for if testing the data
if(test_run) {
modDat_1 <- slice_sample(modDat_1, n = 1e5)
}
For now, not doing any resampling
set.seed(1234)
pred_vars <- c("swe_meanAnnAvg_30yr", "tmean_meanAnnAvg_30yr", "prcp_meanAnnTotal_30yr", "PrecipTempCorr_meanAnnAvg_30yr", "isothermality_meanAnnAvg_30yr", "annWetDegDays_meanAnnAvg_30yr")
# removed VPD, since it's highly correlated w/ tmean and prcp
names(pred_vars) <- pred_vars
# predictor vars are the same in both dfs
## remove rows for data that have NAs in the predictors (lag starts before range of DayMet data)
modDat_1 <- modDat_1 %>%
drop_na(swe_meanAnnAvg_30yr, tmean_meanAnnAvg_30yr, prcp_meanAnnTotal_30yr,
precip_Seasonality_meanAnnAvg_30yr, PrecipTempCorr_meanAnnAvg_30yr,
isothermality_meanAnnAvg_30yr, all_of(response))
df_pred <- modDat_1[, pred_vars]
Break the data into three groups for model fitting based on lumped ecoregions
if (byRegion == TRUE) {
modDat_westForest <- modDat_1 %>%
filter(newRegion == "westForest")
modDat_eastForest <- modDat_1 %>%
filter(newRegion == "eastForest")
modDat_shrubGrass <- modDat_1 %>%
filter(newRegion == "dryShrubGrass")
}
Training data
if (byRegion == TRUE) {
## for western forests
wf_sample <- if(fit_sample & !test_run) {
reordered <- slice_sample(modDat_westForest, prop = 1)
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
if(high > nrow(modDat_westForest)) {
warning(paste0("n_train size due to low sample size (i.e. < ", n_train,")"))
n_train <- n_train*.8
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
#stop('trying to sample from rows that dont exist')
}
reordered[low:high, ]
} else {
modDat_westForest
}
wf_test <- if(fit_sample & !test_run &
# antijoin only works if there are enough rows that meet
# that criterion, i.e. if wf_sample contains most of the data i
# doesnt' work
(nrow(modDat_westForest) - nrow(wf_sample) > n_test)) {
modDat_westForest %>%
anti_join(wf_sample, by = c("cell_num", "year")) %>%
slice_sample(n = n_test)
} else {
modDat_westForest %>%
slice_sample(n = n_test)
}
# small sample for certain plots
wf_small <- slice_sample(modDat_westForest, n = 1e5)
# for eastern forests
ef_sample <- if(fit_sample & !test_run) {
reordered <- slice_sample(modDat_eastForest, prop = 1)
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
if(high > nrow(modDat_eastForest)) {
warning(paste0("n_train size due to low sample size (i.e. < ", n_train,")"))
n_train <- n_train*.8
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
#stop('trying to sample from rows that dont exist')
}
reordered[low:high, ]
} else {
modDat_eastForest
}
ef_test <- if(fit_sample & !test_run &
# antijoin only works if there are enough rows that meet
# that criterion, i.e. if ef_sample contains most of the data i
# doesnt' work
(nrow(modDat_eastForest) - nrow(ef_sample) > n_test)) {
modDat_eastForest %>%
anti_join(ef_sample, by = c("cell_num", "year")) %>%
slice_sample(n = n_test)
} else {
modDat_eastForest %>%
slice_sample(n = n_test)
}
# small sample for certain plots
ef_small <- slice_sample(modDat_eastForest, n = 1e5)
## for grass/shrubs
g_sample <- if(fit_sample & !test_run) {
reordered <- slice_sample(modDat_shrubGrass, prop = 1)
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
if(high > nrow(modDat_shrubGrass)) {
warning(paste0("n_train size due to low sample size (i.e. < ", n_train,")"))
n_train <- n_train*.8
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
#stop('trying to sample from rows that dont exist')
}
reordered[low:high, ]
} else {
modDat_shrubGrass
}
g_test <- if(fit_sample & !test_run &
# antijoin only works if there are enough rows that meet
# that criterion, i.e. if g_sample contains most of the data i
# doesnt' work
(nrow(modDat_shrubGrass) - nrow(g_sample) > n_test)) {
modDat_shrubGrass %>%
anti_join(g_sample, by = c("cell_num", "year")) %>%
slice_sample(n = n_test)
} else {
modDat_shrubGrass %>%
slice_sample(n = n_test)
}
# small sample for certain plots
g_small <- slice_sample(modDat_shrubGrass, n = 1e5)
## do full dataset as well
df_sample <- if(fit_sample & !test_run) {
reordered <- slice_sample(modDat_1, prop = 1)
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
if(high > nrow(modDat_1)) {
warning(paste0("n_train size due to low sample size (i.e. < ", n_train,")"))
n_train <- n_train*.8
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
#stop('trying to sample from rows that dont exist')
}
reordered[low:high, ]
} else {
modDat_1
}
df_test <- if(fit_sample & !test_run &
# antijoin only works if there are enough rows that meet
# that criterion, i.e. if df_sample contains most of the data i
# doesnt' work
(nrow(modDat_1) - nrow(df_sample) > n_test)) {
modDat_1 %>%
anti_join(df_sample, by = c("cell_num", "year")) %>%
slice_sample(n = n_test)
} else {
modDat_1 %>%
slice_sample(n = n_test)
}
# small sample for certain plots
df_small <- slice_sample(modDat_1, n = 1e5)
} else if (byRegion == FALSE) {
df_sample <- if(fit_sample & !test_run) {
reordered <- slice_sample(modDat_1, prop = 1)
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
if(high > nrow(modDat_1)) {
warning(paste0("n_train size due to low sample size (i.e. < ", n_train,")"))
n_train <- n_train*.8
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
#stop('trying to sample from rows that dont exist')
}
reordered[low:high, ]
} else {
modDat_1
}
df_test <- if(fit_sample & !test_run &
# antijoin only works if there are enough rows that meet
# that criterion, i.e. if df_sample contains most of the data i
# doesnt' work
(nrow(modDat_1) - nrow(df_sample) > n_test)) {
modDat_1 %>%
anti_join(df_sample, by = c("cell_num", "year")) %>%
slice_sample(n = n_test)
} else {
modDat_1 %>%
slice_sample(n = n_test)
}
# small sample for certain plots
df_small <- slice_sample(modDat_1, n = 1e5)
}
create_summary <- function(df) {
df %>%
pivot_longer(cols = everything(),
names_to = 'variable') %>%
group_by(variable) %>%
summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>%
mutate(across(where(is.numeric), round, 4))
}
modDat_1[pred_vars] %>%
create_summary() %>%
knitr::kable(caption = 'summaries of predictor variables')
| variable | value_mean | value_min | value_median | value_max |
|---|---|---|---|---|
| PrecipTempCorr_meanAnnAvg_30yr | -0.2419 | -0.8556 | -0.2781 | 0.7145 |
| annWetDegDays_meanAnnAvg_30yr | 1604.5036 | 229.5051 | 1600.2231 | 2713.2516 |
| isothermality_meanAnnAvg_30yr | -37.4805 | -60.9401 | -36.9016 | -20.2171 |
| prcp_meanAnnTotal_30yr | 769.2664 | 52.1693 | 512.5627 | 4116.7860 |
| swe_meanAnnAvg_30yr | 18.5126 | 0.0000 | 3.6910 | 1435.7015 |
| tmean_meanAnnAvg_30yr | 6.7125 | 2.2173 | 6.8762 | 10.3473 |
response_summary <- modDat_1 %>%
dplyr::select(#where(is.numeric), -all_of(pred_vars),
matches(response)) %>%
create_summary()
kable(response_summary,
caption = 'summaries of response variables, calculated using paint')
| variable | value_mean | value_min | value_median | value_max |
|---|---|---|---|---|
| TotalGramCover_dec | 0.2318 | 1e-04 | 0.1188 | 1 |
here using pred dataframe, because smaller and this code is slow.
df_pred %>%
slice_sample(n = 5e4) %>%
#select(-matches("_")) %>%
ggpairs(lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.2)),
columnLabels = c("swe", "tmean", "prcp", "prcpTempCorr", "isothermality", "wetDegDays"))
# vectors of names of response variables
vars_response <- response
# longformat dataframes for making boxplots
if (byRegion == FALSE) {
df_sample_plots <- df_sample %>%
rename(response = all_of(response)) %>%
mutate(response = case_when(
response <= .25 ~ ".25",
response > .25 & response <=.5 ~ ".5",
response > .5 & response <=.75 ~ ".75",
response >= .75 ~ "1",
)) %>%
select(c(response, pred_vars)) %>%
pivot_longer(cols = names(pred_vars),
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
} else {
df_sample_plots <- df_sample %>%
select(c(response, pred_vars, newRegion)) %>%
rename(response = all_of(response),
swe = swe_meanAnnAvg_30yr,
MAT = tmean_meanAnnAvg_30yr,
MAP = prcp_meanAnnTotal_30yr,
PrecipTempCorr = PrecipTempCorr_meanAnnAvg_30yr,
isothermality = isothermality_meanAnnAvg_30yr,
wetDegDays = annWetDegDays_meanAnnAvg_30yr
) %>%
mutate(response = case_when(
response <= .25 ~ ".25",
response > .25 & response <=.5 ~ ".5",
response > .5 & response <=.75 ~ ".75",
response >= .75 ~ "1",
)) %>%
pivot_longer(cols = c(swe, MAT, MAP, PrecipTempCorr, isothermality, wetDegDays),
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
}
# df_biome_long <-
# # for some reason select() was giving me problems
# # adding numYrs here so can take weighted average
# predvars2long(modDat_1, response_vars = c(vars_response),
# pred_vars = pred_vars) #%>%
# # mutate(across(all_of(vars_nfire), factor))
if (byRegion == FALSE) {
ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
geom_boxplot() +
facet_wrap(~predictor, scales = 'free_y') +
ylab("Predictor Variable Values")
} else {
ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
geom_boxplot() +
facet_wrap(~predictor + newRegion , scales = 'free_y') +
ylab("Predictor Variable Values")
}
Creating formulas where each variable on its own is transformed numerous ways (including formula to fit spline), all other variables are left alone, that repeated for each variable. So have models with 1 variable transformed, 2 transformed, etc.
see documentation for glms_iterate_transforms, in the modelling_functions.R script
set.seed(1234)
# adding an interaction term to help deal with over-predicting fire probability
# at pixels with high afgAGB and high MAP. parentheses around interaction
# term are included so that glms_iterate_transforms doesn't transform
# the interaction term.
#pred_vars_inter <- c(pred_vars, params$inter)
# pred_vars_inter <- pred_vars
mods_glm_transf_temp <- glmTransformsIterates(
preds = pred_vars,
df = df_sample,
response_var = response,
delta_aic = 10)
## [1] "step1"
##
## -50480.63
## -51344.49
## delta aic cutoff 10
## [1] "step2"
##
## -51344.49
## -52189.67
## delta aic cutoff 10
## [1] "step3"
##
## -52189.67
## -52699.41
## delta aic cutoff 10
## [1] "step4"
##
## -52699.41
## -52970.28
## delta aic cutoff 10
## [1] "step5"
##
## -52970.28
## -53092.49
## delta aic cutoff 10
## [1] "step6"
##
## -53092.49
## -53185.64
## delta aic cutoff 10
# not including the last element of the list which is final_formula
mods_glm_transf <- mods_glm_transf_temp[-length(mods_glm_transf_temp)]
map_dfc(mods_glm_transf, ~ names(.$aic[1:5])) %>%
kable(caption = "5 best transformations at each step")
| step1 | step2 | step3 | step4 | step5 | step6 |
|---|---|---|---|---|---|
| convert_poly2_annWetDegDays_meanAnnAvg_30yr | convert_poly2_isothermality_meanAnnAvg_30yr | convert_poly2sqrt_swe_meanAnnAvg_30yr | convert_poly2log10_PrecipTempCorr_meanAnnAvg_30yr | convert_poly2log10_prcp_meanAnnTotal_30yr | convert_poly2log10_tmean_meanAnnAvg_30yr |
| convert_poly2sqrt_annWetDegDays_meanAnnAvg_30yr | convert_poly2sqrt_swe_meanAnnAvg_30yr | convert_poly2log10_swe_meanAnnAvg_30yr | convert_log10_PrecipTempCorr_meanAnnAvg_30yr | convert_poly2log10_tmean_meanAnnAvg_30yr | convert_poly2sqrt_tmean_meanAnnAvg_30yr |
| convert_poly2_isothermality_meanAnnAvg_30yr | convert_poly2log10_PrecipTempCorr_meanAnnAvg_30yr | convert_log10_swe_meanAnnAvg_30yr | convert_poly2log10_tmean_meanAnnAvg_30yr | convert_poly2sqrt_tmean_meanAnnAvg_30yr | convert_poly2_tmean_meanAnnAvg_30yr |
| convert_poly2log10_annWetDegDays_meanAnnAvg_30yr | convert_poly2log10_swe_meanAnnAvg_30yr | convert_poly2log10_PrecipTempCorr_meanAnnAvg_30yr | convert_poly2sqrt_tmean_meanAnnAvg_30yr | convert_poly2_tmean_meanAnnAvg_30yr | convert_log10_tmean_meanAnnAvg_30yr |
| convert_poly2sqrt_swe_meanAnnAvg_30yr | convert_log10_swe_meanAnnAvg_30yr | convert_sqrt_swe_meanAnnAvg_30yr | convert_poly2_tmean_meanAnnAvg_30yr | convert_log10_tmean_meanAnnAvg_30yr | convert_sqrt_tmean_meanAnnAvg_30yr |
best_aic_by_step <- map_dbl(mods_glm_transf, ~.$aic[1])
best_aic_by_step <- c(step0 = mods_glm_transf_temp$step1$aic[['convert_none']],
best_aic_by_step)
# AIC improvement by step
diff(best_aic_by_step)
## step1 step2 step3 step4 step5 step6
## -863.85310 -845.17970 -509.73963 -270.87328 -122.20632 -93.15756
plot(y = best_aic_by_step,
x = 1:length(best_aic_by_step) - 1,
ylab = "AIC",
xlab = "Number of variables transformed")
if (byRegion == TRUE) {
## for western forests
mods_glm_transf_WF_temp <- glmTransformsIterates(
preds = pred_vars,
df = wf_sample,
response_var = response,
delta_aic = 10)
# not including the last element of the list which is final_formula
mods_glm_transf_WF <- mods_glm_transf_WF_temp[-length(mods_glm_transf_WF_temp)]
map_dfc(mods_glm_transf_WF, ~ names(.$aic[1:5])) %>%
kable(caption = "5 best transformations at each step")
best_aic_by_step_WF <- map_dbl(mods_glm_transf_WF, ~.$aic[1])
best_aic_by_step_WF <- c(step0 = mods_glm_transf_WF_temp$step1$aic[['convert_none']],
best_aic_by_step_WF)
# AIC improvement by step
diff(best_aic_by_step_WF)
plot(y = best_aic_by_step_WF,
x = 1:length(best_aic_by_step_WF) - 1,
ylab = "AIC",
xlab = "Number of variables transformed")
## for eastern forests
mods_glm_transf_EF_temp <- glmTransformsIterates(
preds = pred_vars,
df = ef_sample,
response_var = response,
delta_aic = 10)
# not including the last element of the list which is final_formula
mods_glm_transf_EF <- mods_glm_transf_EF_temp[-length(mods_glm_transf_EF_temp)]
map_dfc(mods_glm_transf_EF, ~ names(.$aic[1:5])) %>%
kable(caption = "5 best transformations at each step")
best_aic_by_step_EF <- map_dbl(mods_glm_transf_EF, ~.$aic[1])
best_aic_by_step_EF <- c(step0 = mods_glm_transf_EF_temp$step1$aic[['convert_none']],
best_aic_by_step_EF)
# AIC improvement by step
diff(best_aic_by_step_EF)
plot(y = best_aic_by_step_EF,
x = 1:length(best_aic_by_step_EF) - 1,
ylab = "AIC",
xlab = "Number of variables transformed")
## for grasslands/shrubs
mods_glm_transf_G_temp <- glmTransformsIterates(
preds = pred_vars,
df = g_sample,
response_var = response,
delta_aic = 10)
# not including the last element of the list which is final_formula
mods_glm_transf_G <- mods_glm_transf_G_temp[-length(mods_glm_transf_G_temp)]
map_dfc(mods_glm_transf_G, ~ names(.$aic[1:5])) %>%
kable(caption = "5 best transformations at each step")
best_aic_by_step_G <- map_dbl(mods_glm_transf_G, ~.$aic[1])
best_aic_by_step_G <- c(step0 = mods_glm_transf_G_temp$step1$aic[['convert_none']],
best_aic_by_step_G)
# AIC improvement by step
diff(best_aic_by_step_G)
plot(y = best_aic_by_step_G,
x = 1:length(best_aic_by_step_G) - 1,
ylab = "AIC",
xlab = "Number of variables transformed")
}
## [1] "step1"
##
## -114094.9
## -116318.9
## delta aic cutoff 10
## [1] "step2"
##
## -116318.9
## -116760
## delta aic cutoff 10
## [1] "step3"
##
## -116760
## -117024.8
## delta aic cutoff 10
## [1] "step4"
##
## -117024.8
## -117184.8
## delta aic cutoff 10
## [1] "step5"
##
## -117184.8
## -117337.3
## delta aic cutoff 10
## [1] "step6"
##
## -117337.3
## -117344.1
## delta aic cutoff 10
## [1] "step1"
##
## -34569.6
## -34933.28
## delta aic cutoff 10
## [1] "step2"
##
## -34933.28
## -35145.06
## delta aic cutoff 10
## [1] "step3"
##
## -35145.06
## -35252.78
## delta aic cutoff 10
## [1] "step4"
##
## -35252.78
## -35395.94
## delta aic cutoff 10
## [1] "step5"
##
## -35395.94
## -35460.97
## delta aic cutoff 10
## [1] "step6"
##
## -35460.97
## -35471.65
## delta aic cutoff 10
## [1] "step1"
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
##
## 1e+07
## -37321.23
## delta aic cutoff 10
## [1] "step2"
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 10 is not positive
## Error in chol.default(K) : the leading minor of order 10 is not positive
##
## -37321.23
## -38623.31
## delta aic cutoff 10
## [1] "step3"
##
## -38623.31
## -38969.54
## delta aic cutoff 10
## [1] "step4"
##
## -38969.54
## -39231.01
## delta aic cutoff 10
## [1] "step5"
##
## -39231.01
## -39235.35
## delta aic cutoff 10
Best transformation each step.
bestTransformed <- mods_glm_transf_temp$final_formula
if (byRegion == TRUE) {
# for western forests
#var_transformed_WF <- map(mods_glm_transf_WF, function(x) x$var_transformed)
bestTransformed_WF <- mods_glm_transf_WF_temp$final_formula
# for eastern forests
#var_transformed_EF <- map(mods_glm_transf_EF, function(x) x$var_transformed)
bestTransformed_EF <- mods_glm_transf_EF_temp$final_formula
# for grass/shrub
#var_transformed_G <- map(mods_glm_transf_G, function(x) x$var_transformed)
bestTransformed_G <- mods_glm_transf_G_temp$final_formula
}
Plot potential interaction terms using raw data
across values of swe
# calculate quantiles
swe_quants <- quantile(df_sample$swe_meanAnnAvg_30yr, na.rm = TRUE)
# longformat dataframe for making boxplots
df_sample_swe <- df_sample %>%
select(c(response, pred_vars)) %>%
# transform the predictors according to above process
mutate(isotherm = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,1]),
wetDegDays= as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,1]),
swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
#swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
tmean_log = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
#tmean_log_squared = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
prcp_log = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
# prcp_log_squared = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
prcpTempCorr = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,1])
) %>%
rename(response = all_of(response)) %>%
# mutate(response = case_when(
# response <= .25 ~ ".25",
# response > .25 & response <=.5 ~ ".5",
# response > .5 & response <=.75 ~ ".75",
# response >= .75 ~ "1",
# )) %>%
# pivot_longer(cols = 2:16,
# names_to = "predictor",
# values_to = "value") %>%
# filter(!is.na(response) & !is.na(value))
mutate(swe_meanAnnAvg_30yr_quants = case_when(
swe_meanAnnAvg_30yr <= swe_quants[2] ~ swe_quants[2],
swe_meanAnnAvg_30yr > swe_quants[2] & swe_meanAnnAvg_30yr <= swe_quants[3] ~ swe_quants[3],
swe_meanAnnAvg_30yr > swe_quants[3] & swe_meanAnnAvg_30yr <= swe_quants[4] ~ swe_quants[4],
swe_meanAnnAvg_30yr >= swe_quants[4] ~ swe_quants[5],
)) %>%
pivot_longer(cols = 2:13,
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
# plot relationships amongst variables according to different levels of SWE
ggplot(data = df_sample_swe[!(df_sample_swe$predictor %in% c("swe_log","swe_meanAnnAvg_30yr")),]) +
#geom_point(aes(y = response, x = value, col = as.factor(swe_meanAnnAvg_30yr_quants)), alpha = .5) +
geom_smooth(aes(y = response, x = value, col = as.factor(swe_meanAnnAvg_30yr_quants)), method = "gam") +
facet_wrap(~predictor, scales = "free_x")
Mean Annual Temperature
tmean_quants <- quantile(df_sample$tmean_meanAnnAvg_30yr, na.rm = TRUE)
# longformat dataframe for making boxplots
df_sample_tmean <- df_sample %>%
select(c(response, pred_vars)) %>%
# transform the predictors according to above process
mutate(isotherm = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,1]),
wetDegDays= as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,1]),
swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
#swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
tmean_log = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
#tmean_log_squared = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
prcp_log = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
# prcp_log_squared = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
prcpTempCorr = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,1])
) %>%
rename(response = all_of(response)) %>%
# mutate(response = case_when(
# response <= .25 ~ ".25",
# response > .25 & response <=.5 ~ ".5",
# response > .5 & response <=.75 ~ ".75",
# response >= .75 ~ "1",
# )) %>%
# pivot_longer(cols = 2:16,
# names_to = "predictor",
# values_to = "value") %>%
# filter(!is.na(response) & !is.na(value))
mutate(tmean_meanAnnAvg_30yr_quants = case_when(
tmean_meanAnnAvg_30yr <= tmean_quants[2] ~ tmean_quants[2],
tmean_meanAnnAvg_30yr > tmean_quants[2] & tmean_meanAnnAvg_30yr <= tmean_quants[3] ~ tmean_quants[3],
tmean_meanAnnAvg_30yr > tmean_quants[3] & tmean_meanAnnAvg_30yr <= tmean_quants[4] ~ tmean_quants[4],
tmean_meanAnnAvg_30yr >= tmean_quants[4] ~ tmean_quants[5],
)) %>%
pivot_longer(cols = 2:13,
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
# plot relationships amongst variables according to different levels of tmean
ggplot(data = df_sample_tmean[!(df_sample_tmean$predictor %in% c("tmean_log", "tmean_meanAnnAvg_30yr")),]) +
#geom_point(aes(y = response, x = value, col = as.factor(tmean_meanAnnAvg_30yr_quants)), alpha = .3) +
geom_smooth(aes(y = response, x = value, col = as.factor(tmean_meanAnnAvg_30yr_quants)), method = "gam") +
facet_wrap(~predictor, scales = "free_x")
Mean annual precipitation
prcp_quants <- quantile(df_sample$prcp_meanAnnTotal_30yr, na.rm = TRUE)
# longformat dataframe for making boxplots
df_sample_prcp <- df_sample %>%
select(c(response, pred_vars)) %>%
# transform the predictors according to above process
mutate(isotherm = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,1]),
wetDegDays= as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,1]),
swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
#swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
tmean_log = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
#tmean_log_squared = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
prcp_log = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
# prcp_log_squared = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
prcpTempCorr = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,1])
) %>%
rename(response = all_of(response)) %>%
# mutate(response = case_when(
# response <= .25 ~ ".25",
# response > .25 & response <=.5 ~ ".5",
# response > .5 & response <=.75 ~ ".75",
# response >= .75 ~ "1",
# )) %>%
# pivot_longer(cols = 2:16,
# names_to = "predictor",
# values_to = "value") %>%
# filter(!is.na(response) & !is.na(value))
mutate(prcp_meanAnnTotal_30yr_quants = case_when(
prcp_meanAnnTotal_30yr <= prcp_quants[2] ~ prcp_quants[2],
prcp_meanAnnTotal_30yr > prcp_quants[2] & prcp_meanAnnTotal_30yr <= prcp_quants[3] ~ prcp_quants[3],
prcp_meanAnnTotal_30yr > prcp_quants[3] & prcp_meanAnnTotal_30yr <= prcp_quants[4] ~ prcp_quants[4],
prcp_meanAnnTotal_30yr >= prcp_quants[4] ~ prcp_quants[5],
)) %>%
pivot_longer(cols = 2:13,
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
# plot relationships amongst variables according to different levels of tmean
ggplot(data = df_sample_prcp[!(df_sample_prcp$predictor %in% c("prcp_log", "prcp_meanAnnTotal_30yr")),]) +
#geom_point(aes(y = response, x = value, col = as.factor(prcp_meanAnnTotal_30yr_quants)), alpha = .3) +
geom_smooth(aes(y = response, x = value, col = as.factor(prcp_meanAnnTotal_30yr_quants)), method = "gam") +
facet_wrap(~predictor, scales = "free_x")
## make table of possible interactions
# make list of transformed variables into a data.frame
var_transformed_df <- as.data.frame(str_remove(bestTransformed, paste0(response, " ~ ")) %>%
str_split(pattern = fixed(" + "), simplify = TRUE))#purrr::flatten_dfr(var_transformed_WF) #%>%
# if has a "stats::poly( to start, remove that part
temp <- apply(var_transformed_df, MARGIN = 2, FUN = function(x) {
if(str_detect(x, pattern = fixed("stats::poly("))) {
x_new <- str_remove(x, pattern = fixed("stats::poly(")) %>%
str_remove_all(pattern = " ") %>%
str_remove(pattern = fixed(",2,raw=TRUE)"))
} else {
x_new <- x
}
})
# combine into the 32 possible interactions
combos <- data.frame(gtools::permutations(n = length(temp),
r = 2,
v = temp))
combos$interactions <- paste0(combos$X1, ":", combos$X2)
# make text string of interactions
int_string <-str_flatten(combos$interactions, collapse = " + ")
# formula w/ interactions included
modWithInteractions <- paste(bestTransformed, "+",int_string) %>%
str_remove_all(fixed("stats::"))
# refitting the glm with the best formula
mod_glm1 <- betareg(as.formula(modWithInteractions), data = df_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
if (byRegion == TRUE) {
# western forests
## make table of possible interactions
# make list of transformed variables into a data.frame
var_transformed_df_WF <- as.data.frame(str_remove(bestTransformed_WF, paste0(response, " ~ ")) %>%
str_split(pattern = fixed(" + "), simplify = TRUE))#purrr::flatten_dfr(var_transformed_WF) #%>%
# if has a "stats::poly( to start, remove that part
temp_WF <- apply(var_transformed_df_WF, MARGIN = 2, FUN = function(x) {
if(str_detect(x, pattern = fixed("stats::poly("))) {
x_new <- str_remove(x, pattern = fixed("stats::poly(")) %>%
str_remove_all(" ") %>%
str_remove(pattern = fixed(",2,raw=TRUE)"))
} else {
x_new <- x
}
})
# combine into the 32 possible interactions
combos_WF <- data.frame(gtools::permutations(n = length(temp_WF),
r = 2,
v = temp_WF))
combos_WF$interactions <- paste0(combos_WF$X1, ":", combos_WF$X2)
# make text string of interactions
int_string_WF <-str_flatten(combos_WF$interactions, collapse = " + ")
# formula w/ interactions included
modWithInteractions_WF <- paste(bestTransformed_WF, "+", int_string_WF) %>% str_remove_all(fixed("stats::"))
# refitting the glm with the best formula
mod_glm1_WF <- betareg(as.formula(modWithInteractions_WF), data = wf_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
# eastern forests
## make table of possible interactions
# make list of transformed variables into a data.frame
var_transformed_df_EF <- as.data.frame(str_remove(bestTransformed_EF, paste0(response, " ~ ")) %>%
str_split(pattern = fixed(" + "), simplify = TRUE))#purrr::flatten_dfr(var_transformed_EF) #%>%
# if has a "stats::poly( to start, remove that part
temp_EF <- apply(var_transformed_df_EF, MARGIN = 2, FUN = function(x) {
if(str_detect(x, pattern = fixed("stats::poly("))) {
x_new <- str_remove(x, pattern = fixed("stats::poly(")) %>%
str_remove_all(" ") %>%
str_remove(pattern = fixed(",2,raw=TRUE)"))
} else {
x_new <- x
}
})
# combine into the 30 possible interactions
combos_EF <- data.frame(gtools::permutations(n = length(temp_EF),
r = 2,
v = temp_EF))
combos_EF$interactions <- paste0(combos_EF$X1, ":", combos_EF$X2)
# make text string of interactions
int_string_EF <-str_flatten(combos_EF$interactions, collapse = " + ")
# formula w/ interactions included
modWithInteractions_EF <- paste(bestTransformed_EF, "+", int_string_EF) %>% str_remove_all(fixed("stats::"))
# refitting the glm with the best formula
mod_glm1_EF <- betareg(as.formula(modWithInteractions_EF), data = ef_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
## grassland/shrubs
## make table of possible interactions
# make list of transformed variables into a data.frame
var_transformed_df_G <- as.data.frame(str_remove(bestTransformed_G, paste0(response, " ~ ")) %>%
str_split(pattern = fixed(" + "), simplify = TRUE))#purrr::flatten_dfr(var_transformed_G) #%>%
# if has a "stats::poly( to start, remove that part
temp_G <- apply(var_transformed_df_G, MARGIN = 2, FUN = function(x) {
if(str_detect(x, pattern = fixed("stats::poly("))) {
x_new <- str_remove(x, pattern = fixed("stats::poly(")) %>%
str_remove_all(" ") %>%
str_remove(pattern = fixed(",2,raw=TRUE)"))
} else {
x_new <- x
}
})
# combine into the 30 possible interactions
combos_G <- data.frame(gtools::permutations(n = length(temp_G),
r = 2,
v = temp_G))
combos_G$interactions <- paste0(combos_G$X1, ":", combos_G$X2)
# make text string of interactions
int_string_G <-str_flatten(combos_G$interactions, collapse = " + ")
# formula w/ interactions included
modWithInteractions_G <- paste(bestTransformed_G, "+", int_string_G) %>% str_remove_all(fixed("stats::"))
# rGitting the glm with the best formula
mod_glm1_G <- betareg(as.formula(modWithInteractions_G), data = g_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
}
Using the “StepBeta” function from the StepBeta R package
modSelection <- StepBeta_mine(mod_glm1, data = df_sample)
## [1] "100 % of the process"
if (byRegion == TRUE) {
modSelection_WF <- StepBeta_mine(mod_glm1_WF, data = wf_sample)
modSelection_EF <- StepBeta_mine(mod_glm1_EF, data = ef_sample)
modSelection_G <- StepBeta_mine(mod_glm1_G, data = g_sample)
}
## [1] "100 % of the process"
## [1] "100 % of the process"
## [1] "100 % of the process"
## Error in chol.default(K) : the leading minor of order 14 is not positive
Response variables are the proportion of years in which fires occurred. Using best model formula selected in the previous step
best_form <- modSelection$formula
print(best_form)
## TotalGramCover_dec ~ poly(I(log10(I(PrecipTempCorr_meanAnnAvg_30yr +
## 1))), 2, raw = TRUE) + poly(annWetDegDays_meanAnnAvg_30yr,
## 2, raw = TRUE) + poly(isothermality_meanAnnAvg_30yr, 2, raw = TRUE) +
## poly(I(sqrt(swe_meanAnnAvg_30yr)), 2, raw = TRUE) + poly(I(log10(I(prcp_meanAnnTotal_30yr +
## 1))), 2, raw = TRUE) + poly(I(log10(I(tmean_meanAnnAvg_30yr +
## 1))), 2, raw = TRUE) + I(log10(I(tmean_meanAnnAvg_30yr +
## 1))):I(sqrt(swe_meanAnnAvg_30yr)) + I(log10(I(prcp_meanAnnTotal_30yr +
## 1))):I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 1))) + I(log10(I(prcp_meanAnnTotal_30yr +
## 1))):isothermality_meanAnnAvg_30yr + I(log10(I(PrecipTempCorr_meanAnnAvg_30yr +
## 1))):isothermality_meanAnnAvg_30yr + annWetDegDays_meanAnnAvg_30yr:I(log10(I(prcp_meanAnnTotal_30yr +
## 1))) + annWetDegDays_meanAnnAvg_30yr:I(log10(I(PrecipTempCorr_meanAnnAvg_30yr +
## 1))) + I(log10(I(tmean_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr +
## I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 1))):I(log10(I(tmean_meanAnnAvg_30yr +
## 1))) + I(log10(I(prcp_meanAnnTotal_30yr + 1))):I(log10(I(tmean_meanAnnAvg_30yr +
## 1))) + I(sqrt(swe_meanAnnAvg_30yr)):isothermality_meanAnnAvg_30yr +
## I(log10(I(prcp_meanAnnTotal_30yr + 1))):I(sqrt(swe_meanAnnAvg_30yr)) +
## annWetDegDays_meanAnnAvg_30yr:I(sqrt(swe_meanAnnAvg_30yr)) +
## I(log10(I(PrecipTempCorr_meanAnnAvg_30yr + 1))):I(sqrt(swe_meanAnnAvg_30yr)) +
## annWetDegDays_meanAnnAvg_30yr:isothermality_meanAnnAvg_30yr
## <environment: 0x3aac714e0>
# refitting the glm with the best formula
mod_glmFinal <- betareg(as.formula(best_form), data = df_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
# should be the same AIC (i.e. refitting the same model)
AIC(mod_glmFinal)
## [1] -57857.93
if (byRegion == TRUE) {
## western forests
best_form_WF <- modSelection_WF$formula
print(best_form_WF)
# refitting the glm with the best formula
mod_glmFinal_WF <- betareg(as.formula(best_form_WF), data = wf_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
# should be the same AIC (i.e. refitting the same model)
AIC(mod_glmFinal_WF)
## easter forests
best_form_EF <- modSelection_EF$formula
print(best_form_EF)
# refitting the glm with the best formula
mod_glmFinal_EF <- betareg(as.formula(best_form_EF), data = ef_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
# should be the same AIC (i.e. refitting the same model)
AIC(mod_glmFinal_EF)
## grass/shrub
best_form_G <- modSelection_G$formula
print(best_form_G)
# refitting the glm with the best formula
mod_glmFinal_G <- betareg(as.formula(best_form_G), data = g_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
# should be the same AIC (i.e. refitting the same model)
AIC(mod_glmFinal_G)
}
## TotalGramCover_dec ~ poly(I(sqrt(prcp_meanAnnTotal_30yr)), 2,
## raw = TRUE) + poly(isothermality_meanAnnAvg_30yr, 2, raw = TRUE) +
## poly(PrecipTempCorr_meanAnnAvg_30yr, 2, raw = TRUE) + annWetDegDays_meanAnnAvg_30yr +
## poly(I(sqrt(swe_meanAnnAvg_30yr)), 2, raw = TRUE) + poly(tmean_meanAnnAvg_30yr,
## 2, raw = TRUE) + I(sqrt(prcp_meanAnnTotal_30yr)):isothermality_meanAnnAvg_30yr +
## isothermality_meanAnnAvg_30yr:PrecipTempCorr_meanAnnAvg_30yr +
## I(sqrt(prcp_meanAnnTotal_30yr)):I(sqrt(swe_meanAnnAvg_30yr)) +
## annWetDegDays_meanAnnAvg_30yr:I(sqrt(prcp_meanAnnTotal_30yr)) +
## I(sqrt(prcp_meanAnnTotal_30yr)):PrecipTempCorr_meanAnnAvg_30yr +
## I(sqrt(prcp_meanAnnTotal_30yr)):tmean_meanAnnAvg_30yr + annWetDegDays_meanAnnAvg_30yr:I(sqrt(swe_meanAnnAvg_30yr)) +
## I(sqrt(swe_meanAnnAvg_30yr)):PrecipTempCorr_meanAnnAvg_30yr +
## annWetDegDays_meanAnnAvg_30yr:tmean_meanAnnAvg_30yr + I(sqrt(swe_meanAnnAvg_30yr)):isothermality_meanAnnAvg_30yr +
## I(sqrt(swe_meanAnnAvg_30yr)):tmean_meanAnnAvg_30yr
## <environment: 0x3abfc8498>
## TotalGramCover_dec ~ poly(annWetDegDays_meanAnnAvg_30yr, 2, raw = TRUE) +
## poly(I(log10(I(tmean_meanAnnAvg_30yr + 1))), 2, raw = TRUE) +
## poly(prcp_meanAnnTotal_30yr, 2, raw = TRUE) + poly(isothermality_meanAnnAvg_30yr,
## 2, raw = TRUE) + poly(I(log10(I(swe_meanAnnAvg_30yr + 1))),
## 2, raw = TRUE) + poly(PrecipTempCorr_meanAnnAvg_30yr, 2,
## raw = TRUE) + annWetDegDays_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr +
## I(log10(I(swe_meanAnnAvg_30yr + 1))):prcp_meanAnnTotal_30yr +
## I(log10(I(tmean_meanAnnAvg_30yr + 1))):PrecipTempCorr_meanAnnAvg_30yr +
## isothermality_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr + I(log10(I(tmean_meanAnnAvg_30yr +
## 1))):prcp_meanAnnTotal_30yr + annWetDegDays_meanAnnAvg_30yr:PrecipTempCorr_meanAnnAvg_30yr +
## I(log10(I(swe_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr +
## I(log10(I(swe_meanAnnAvg_30yr + 1))):I(log10(I(tmean_meanAnnAvg_30yr +
## 1))) + prcp_meanAnnTotal_30yr:PrecipTempCorr_meanAnnAvg_30yr +
## I(log10(I(swe_meanAnnAvg_30yr + 1))):PrecipTempCorr_meanAnnAvg_30yr +
## annWetDegDays_meanAnnAvg_30yr:I(log10(I(tmean_meanAnnAvg_30yr +
## 1))) + annWetDegDays_meanAnnAvg_30yr:I(log10(I(swe_meanAnnAvg_30yr +
## 1))) + annWetDegDays_meanAnnAvg_30yr:isothermality_meanAnnAvg_30yr +
## I(log10(I(tmean_meanAnnAvg_30yr + 1))):isothermality_meanAnnAvg_30yr
## <environment: 0x3b7a29158>
## TotalGramCover_dec ~ isothermality_meanAnnAvg_30yr + poly(prcp_meanAnnTotal_30yr,
## 2, raw = TRUE) + poly(PrecipTempCorr_meanAnnAvg_30yr, 2,
## raw = TRUE) + poly(annWetDegDays_meanAnnAvg_30yr, 2, raw = TRUE) +
## poly(I(log10(I(tmean_meanAnnAvg_30yr + 1))), 2, raw = TRUE) +
## swe_meanAnnAvg_30yr + isothermality_meanAnnAvg_30yr:PrecipTempCorr_meanAnnAvg_30yr +
## swe_meanAnnAvg_30yr:I(log10(I(tmean_meanAnnAvg_30yr + 1))) +
## prcp_meanAnnTotal_30yr:PrecipTempCorr_meanAnnAvg_30yr + isothermality_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr +
## isothermality_meanAnnAvg_30yr:annWetDegDays_meanAnnAvg_30yr +
## annWetDegDays_meanAnnAvg_30yr:PrecipTempCorr_meanAnnAvg_30yr +
## I(log10(I(tmean_meanAnnAvg_30yr + 1))):prcp_meanAnnTotal_30yr +
## annWetDegDays_meanAnnAvg_30yr:I(log10(I(tmean_meanAnnAvg_30yr +
## 1))) + swe_meanAnnAvg_30yr:annWetDegDays_meanAnnAvg_30yr +
## isothermality_meanAnnAvg_30yr:I(log10(I(tmean_meanAnnAvg_30yr +
## 1))) + I(log10(I(tmean_meanAnnAvg_30yr + 1))):PrecipTempCorr_meanAnnAvg_30yr +
## annWetDegDays_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr + swe_meanAnnAvg_30yr:prcp_meanAnnTotal_30yr
## <environment: 0x3b846c710>
## [1] -40966.46
PDP plot trend made using a small sample of the data
#vip::vip(mod_glmFinal, num_features = 15)
#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)
#caret::varImp(mod_glmFinal)
Predicting on the data
# create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
df_out <- df
response_name <- paste0(response, "_pred")
df_out[[response_name]] <- predict(mod, df, type = 'response')
df_out
}
pred_glm1 <- predict_by_response(mod_glmFinal, df_test)
if (byRegion == TRUE) {
## western forests
# create prediction
pred_glm1_WF <- predict_by_response(mod_glmFinal_WF, wf_test)
## eastern forests
# create prediction
pred_glm1_EF <- predict_by_response(mod_glmFinal_EF, ef_test)
## grass/shrub
# create prediction
pred_glm1_G <- predict_by_response(mod_glmFinal_G, g_test)
## add predictions together for later figures
pred_glm1_ALL <- rbind(pred_glm1_WF, pred_glm1_EF, pred_glm1_G)
}
For CONUS-wide model
pred_glm1 <- pred_glm1 %>%
mutate(resid = .[[response]] - .[[paste0(response,"_pred")]])
# rasterize
# get reference raster
test_rast <- rast("../../../data/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
terra::aggregate(fact = 32, fun = "mean")
# rasterize data
plotResid_rast <- pred_glm1 %>%
drop_na(resid) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Lon", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = "resid",
fun = mean) %>%
terra::crop(ext(-2000000, 2500000, -2000000, 1200000))
# plot
ggplot() +
geom_spatraster(data = plotResid_rast) +
ggtitle(paste0("Resids. from Best CONUS wide model of ",s)) +
scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white")
# terra::plot(plotResid_rast, main = paste0("Resids. from Best CONUS wide model of ",s), clip = TRUE,
# plg = list(title = "resid."),
# col = map.pal("curvature"))
If applicable, residual plots of region-level models
if (byRegion == TRUE){
pred_glm1_ALL <- pred_glm1_ALL %>%
mutate(resid = .[[response]] - .[[paste0(response,"_pred")]])
# rasterize
# rasterize data
plotResid_rast_ALL <- pred_glm1_ALL %>%
drop_na(resid) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Lon", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = "resid",
fun = mean) %>%
terra::crop(ext(-2000000, 2500000, -2000000, 1200000))
# plot
ggplot() +
geom_spatraster(data = plotResid_rast_ALL) +
ggtitle(paste0("Resids. from regional models of ",s)) +
scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white")
}
if (byRegion == TRUE){
## for western forests
pred_glm1_WF <- pred_glm1_WF %>%
mutate(resid = .[[response]] - .[[paste0(response,"_pred")]])
# rasterize
# rasterize data
plotResid_rast_WF <- pred_glm1_WF %>%
drop_na(resid) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Lon", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = "resid",
fun = mean) %>%
terra::crop(ext(-2000000, 0, -1500000, 1200000))
westForestRast <- ggplot() +
geom_spatraster(data = plotResid_rast_WF) +
ggtitle(paste0("Resids. from western forest-wide model of ",s)) +
scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white")
## for eastern forests
pred_glm1_EF <- pred_glm1_EF %>%
mutate(resid = .[[response]] - .[[paste0(response,"_pred")]])
# rasterize data
plotResid_rast_EF <- pred_glm1_EF %>%
drop_na(resid) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Lon", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = "resid",
fun = mean) %>%
terra::crop(ext(0, 2500000, -1700000, 1200000))
EastForestRast <- ggplot() +
geom_spatraster(data = plotResid_rast_EF) +
ggtitle(paste0("Resids. from eastern forest-wide model of ",s)) +
scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white")
## for shrub/grassland
pred_glm1_G <- pred_glm1_G %>%
mutate(resid = .[[response]] - .[[paste0(response,"_pred")]])
# rasterize data
plotResid_rast_G <- pred_glm1_G %>%
drop_na(resid) %>%
#slice_sample(n = 5e4) %>%
terra::vect(geom = c("Lon", "Lat")) %>%
terra::set.crs(crs(test_rast)) %>%
terra::rasterize(y = test_rast,
field = "resid",
fun = mean) %>%
terra::crop(ext(-2000000, 900000, -1900000, 1000000))
grassRast <- ggplot() +
geom_spatraster(data = plotResid_rast_G) +
ggtitle(paste0("Resids. from eastern grass/shrubland-wide model of ",s)) +
scale_fill_viridis_c(option = "turbo", limits = c(-.55, .91), na.value = "white")
westForestRast / EastForestRast / grassRast
}
Binning predictor variables into deciles (actually percentiles) and looking at the mean predicted probability for each percentile. The use of the word deciles is just a legacy thing (they started out being actual deciles)
Then predicting on an identical dataset but with warming
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
pred_glm1_deciles <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = pred_vars)
if (byRegion == TRUE) {
## western forests
pred_glm1_deciles_WF <- predvars2deciles(pred_glm1_WF,
response_vars = response_vars,
pred_vars = pred_vars) %>%
mutate(newRegion = "westernForest")
## eastern forests
pred_glm1_deciles_EF <- predvars2deciles(pred_glm1_EF,
response_vars = response_vars,
pred_vars = pred_vars) %>%
mutate(newRegion = "easternForest")
## grass/shrub
pred_glm1_deciles_G <- predvars2deciles(pred_glm1_G,
response_vars = response_vars,
pred_vars = pred_vars) %>%
mutate(newRegion = "grassShrub")
## put together for later figures
pred_glm1_deciles_ALL <- predvars2deciles(pred_glm1_ALL,
response_vars = response_vars,
pred_vars = pred_vars)
}
Publication quality quantile plot
# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles, response = response) + ggtitle("Decile Plot for CONUS-wide model")
if(!hmod) {
# obs/pred inset
g4 <- add_dotplot_inset(g3, pred_glm1_deciles)
} else {
g4 <- g3
}
if (byRegion == FALSE){
g4
}
if(save_figs) {
png(paste0("figures/quantile_plots/quantile_plot_v5_CONUS_wideModel_", s, ".png"),
units = "in", res = 600, width = 5.5, height = 3.5 )
print(g2)
dev.off()
}
if (byRegion == TRUE) {
# publication quality version
g <- decile_dotplot_pq(pred_glm1_deciles_ALL, response = response) + ggtitle("Decile Plot for ecoregion-level model")
if(!hmod) {
# obs/pred inset
g2 <- add_dotplot_inset(g, pred_glm1_deciles_ALL)
} else {
g2 <- g
}
if(save_figs) {
png(paste0("figures/quantile_plots/quantile_plot_v5_regionLevelModel", s, ".png"),
units = "in", res = 600, width = 5.5, height = 3.5 )
print(g2)
dev.off()
}
g4/g2
}
20th and 80th percentiles for each climate variable
df <- pred_glm1[, pred_vars] #%>%
#mutate(MAT = MAT - 273.15) # k to c
map(df, quantile, probs = c(0.2, 0.8), na.rm = TRUE)
## $swe_meanAnnAvg_30yr
## 20% 80%
## 0.02858868 20.36554599
##
## $tmean_meanAnnAvg_30yr
## 20% 80%
## 5.598118 7.775022
##
## $prcp_meanAnnTotal_30yr
## 20% 80%
## 324.0229 1283.0649
##
## $PrecipTempCorr_meanAnnAvg_30yr
## 20% 80%
## -0.6771556 0.1923490
##
## $isothermality_meanAnnAvg_30yr
## 20% 80%
## -41.67058 -33.55516
##
## $annWetDegDays_meanAnnAvg_30yr
## 20% 80%
## 1327.333 1912.278
Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each climate variable.
clim_vars <- c("swe_meanAnnAvg_30yr", "tmean_meanAnnAvg_30yr", "prcp_meanAnnTotal_30yr", "precip_Seasonality_meanAnnAvg_30yr", "isothermality_meanAnnAvg_30yr", "annWetDegDays_meanAnnAvg_30yr")
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1,
response_vars = response_vars,
pred_vars = pred_vars,
filter_var = TRUE,
filter_vars = pred_vars)
decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = clim_vars)
#decile_dotplot_filtered_pq(pred_glm1_deciles_filt)
Filtered quantile figure with middle 2 deciles also shown (this is very memory intensive so no running at the moment)
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = pred_vars,
filter_vars = pred_vars,
filter_var = TRUE,
add_mid = TRUE)
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, xvars = pred_vars)
g
if(save_figs) {x
jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", s, ".jpeg"),
units = "in", res = 600, width = 5.5, height = 6 )
g
dev.off()
}
# glm models
mods2save <- butcher::butcher(mod_glmFinal) # removes some model components so the saved object isn't huge
mods2save$formula <- best_form
#mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
n <- nrow(df_sample)
mods2save$data_rows <- n
if (byRegion == TRUE){
mods2save_WF <- butcher::butcher(mod_glmFinal_WF) # removes some model components so the saved object isn't huge
mods2save_WF$formula <- best_form_WF
#mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
mods2save_WF$data_rows <- nrow(wf_sample)
mods2save_EF <- butcher::butcher(mod_glmFinal_EF) # removes some model components so the saved object isn't huge
mods2save_EF$formula <- best_form_EF
#mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
mods2save_EF$data_rows <- nrow(ef_sample)
mods2save_G <- butcher::butcher(mod_glmFinal_G) # removes some model components so the saved object isn't huge
mods2save_G$formula <- best_form_G
#mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
mods2save_G$data_rows <- nrow(g_sample)
}
if(!test_run) {
saveRDS(mods2save,
paste0("./models/glm_beta_model_CONUSwide_", s, "_n", n,
#sample_group,
".RDS"))
if (byRegion == TRUE) {
## western forests
saveRDS(mods2save_WF,
paste0("./models/glm_beta_model_WesternForests_", s, "_n", n,
#sample_group,
".RDS"))
## eastern forests
saveRDS(mods2save_EF,
paste0("./models/glm_beta_model_EasternForests_", s, "_n", n,
#sample_group,
".RDS"))
## grass/shrub
saveRDS(mods2save_G,
paste0("./models/glm_beta_model_GrassShrub_", s, "_n", n,
#sample_group,
".RDS"))
}
}
Hash of current commit (i.e. to ID the version of the code used)
system("git rev-parse HEAD", intern=TRUE)
## [1] "4b96609b6a605eb50ac34314dfd06b67889bd7fc"
Packages etc.
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] StepBeta_2.1.0 ggtext_0.1.2 knitr_1.46 gridExtra_2.3
## [5] pdp_0.8.1 GGally_2.2.1 lubridate_1.9.3 forcats_1.0.0
## [9] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [13] tibble_3.2.1 tidyverse_2.0.0 betareg_3.1-4 caret_6.0-94
## [17] lattice_0.22-6 ggplot2_3.5.1 sf_1.0-16 tidyterra_0.6.1
## [21] terra_1.7-78 dtplyr_1.3.1 patchwork_1.2.0 stringr_1.5.1
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 pROC_1.18.5 sandwich_3.1-0
## [4] rlang_1.1.4 magrittr_2.0.3 e1071_1.7-14
## [7] compiler_4.4.0 mgcv_1.9-1 flexmix_2.3-19
## [10] vctrs_0.6.5 reshape2_1.4.4 combinat_0.0-8
## [13] pkgconfig_2.0.3 fastmap_1.2.0 labeling_0.4.3
## [16] utf8_1.2.4 rmarkdown_2.27 markdown_1.13
## [19] prodlim_2024.06.25 tzdb_0.4.0 xfun_0.44
## [22] modeltools_0.2-23 cachem_1.1.0 jsonlite_1.8.8
## [25] recipes_1.1.0 highr_0.10 parallel_4.4.0
## [28] R6_2.5.1 bslib_0.7.0 stringi_1.8.4
## [31] RColorBrewer_1.1-3 parallelly_1.37.1 rpart_4.1.23
## [34] lmtest_0.9-40 jquerylib_0.1.4 Rcpp_1.0.12
## [37] iterators_1.0.14 future.apply_1.11.2 zoo_1.8-12
## [40] butcher_0.3.4 Matrix_1.7-0 splines_4.4.0
## [43] nnet_7.3-19 timechange_0.3.0 tidyselect_1.2.1
## [46] rstudioapi_0.16.0 yaml_2.3.8 timeDate_4032.109
## [49] codetools_0.2-20 listenv_0.9.1 plyr_1.8.9
## [52] withr_3.0.0 evaluate_0.23 future_1.33.2
## [55] survival_3.5-8 ggstats_0.6.0 units_0.8-5
## [58] proxy_0.4-27 xml2_1.3.6 pillar_1.9.0
## [61] KernSmooth_2.23-22 foreach_1.5.2 stats4_4.4.0
## [64] generics_0.1.3 hms_1.1.3 commonmark_1.9.1
## [67] aod_1.3.3 munsell_0.5.1 scales_1.3.0
## [70] gtools_3.9.5 globals_0.16.3 class_7.3-22
## [73] glue_1.7.0 tools_4.4.0 data.table_1.15.4
## [76] ModelMetrics_1.2.2.2 gower_1.0.1 grid_4.4.0
## [79] ipred_0.9-15 colorspace_2.1-0 nlme_3.1-164
## [82] Formula_1.2-5 cli_3.6.2 fansi_1.0.6
## [85] viridisLite_0.4.2 lava_1.8.0 gtable_0.3.5
## [88] sass_0.4.9 digest_0.6.35 classInt_0.4-10
## [91] farver_2.1.2 htmltools_0.5.8.1 lifecycle_1.0.4
## [94] hardhat_1.4.0 gridtext_0.1.5 MASS_7.3-60.2